/*-------------------------------------------------------------------------
 *
 * tsquery_cleanup.c
 *     Cleanup query from NOT values and/or stopword
 *     Utility functions to correct work.
 *
 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
 *
 *
 * IDENTIFICATION
 *      src/backend/utils/adt/tsquery_cleanup.c
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "tsearch/ts_utils.h"
#include "miscadmin.h"

typedef struct NODE
{
    struct NODE *left;
    struct NODE *right;
    QueryItem  *valnode;
} NODE;

/*
 * make query tree from plain view of query
 */
static NODE *
maketree(QueryItem *in)
{
    NODE       *node = (NODE *) palloc(sizeof(NODE));

    /* since this function recurses, it could be driven to stack overflow. */
    check_stack_depth();

    node->valnode = in;
    node->right = node->left = NULL;
    if (in->type == QI_OPR)
    {
        node->right = maketree(in + 1);
        if (in->qoperator.oper != OP_NOT)
            node->left = maketree(in + in->qoperator.left);
    }
    return node;
}

/*
 * Internal state for plaintree and plainnode
 */
typedef struct
{
    QueryItem  *ptr;
    int            len;            /* allocated size of ptr */
    int            cur;            /* number of elements in ptr */
} PLAINTREE;

static void
plainnode(PLAINTREE *state, NODE *node)
{
    /* since this function recurses, it could be driven to stack overflow. */
    check_stack_depth();

    if (state->cur == state->len)
    {
        state->len *= 2;
        state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem));
    }
    memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem));
    if (node->valnode->type == QI_VAL)
        state->cur++;
    else if (node->valnode->qoperator.oper == OP_NOT)
    {
        state->ptr[state->cur].qoperator.left = 1;
        state->cur++;
        plainnode(state, node->right);
    }
    else
    {
        int            cur = state->cur;

        state->cur++;
        plainnode(state, node->right);
        state->ptr[cur].qoperator.left = state->cur - cur;
        plainnode(state, node->left);
    }
    pfree(node);
}

/*
 * make plain view of tree from a NODE-tree representation
 */
static QueryItem *
plaintree(NODE *root, int *len)
{
    PLAINTREE    pl;

    pl.cur = 0;
    pl.len = 16;
    if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
    {
        pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
        plainnode(&pl, root);
    }
    else
        pl.ptr = NULL;
    *len = pl.cur;
    return pl.ptr;
}

static void
freetree(NODE *node)
{
    /* since this function recurses, it could be driven to stack overflow. */
    check_stack_depth();

    if (!node)
        return;
    if (node->left)
        freetree(node->left);
    if (node->right)
        freetree(node->right);
    pfree(node);
}

/*
 * clean tree for ! operator.
 * It's useful for debug, but in
 * other case, such view is used with search in index.
 * Operator ! always return TRUE
 */
static NODE *
clean_NOT_intree(NODE *node)
{// #lizard forgives
    /* since this function recurses, it could be driven to stack overflow. */
    check_stack_depth();

    if (node->valnode->type == QI_VAL)
        return node;

    if (node->valnode->qoperator.oper == OP_NOT)
    {
        freetree(node);
        return NULL;
    }

    /* operator & or | */
    if (node->valnode->qoperator.oper == OP_OR)
    {
        if ((node->left = clean_NOT_intree(node->left)) == NULL ||
            (node->right = clean_NOT_intree(node->right)) == NULL)
        {
            freetree(node);
            return NULL;
        }
    }
    else
    {
        NODE       *res = node;

        Assert(node->valnode->qoperator.oper == OP_AND ||
               node->valnode->qoperator.oper == OP_PHRASE);

        node->left = clean_NOT_intree(node->left);
        node->right = clean_NOT_intree(node->right);
        if (node->left == NULL && node->right == NULL)
        {
            pfree(node);
            res = NULL;
        }
        else if (node->left == NULL)
        {
            res = node->right;
            pfree(node);
        }
        else if (node->right == NULL)
        {
            res = node->left;
            pfree(node);
        }
        return res;
    }
    return node;
}

QueryItem *
clean_NOT(QueryItem *ptr, int *len)
{
    NODE       *root = maketree(ptr);

    return plaintree(clean_NOT_intree(root), len);
}


/*
 * Remove QI_VALSTOP (stopword) nodes from query tree.
 *
 * Returns NULL if the query degenerates to nothing.  Input must not be NULL.
 *
 * When we remove a phrase operator due to removing one or both of its
 * arguments, we might need to adjust the distance of a parent phrase
 * operator.  For example, 'a' is a stopword, so:
 *        (b <-> a) <-> c  should become    b <2> c
 *        b <-> (a <-> c)  should become    b <2> c
 *        (b <-> (a <-> a)) <-> c  should become    b <3> c
 *        b <-> ((a <-> a) <-> c)  should become    b <3> c
 * To handle that, we define two output parameters:
 *        ladd: amount to add to a phrase distance to the left of this node
 *        radd: amount to add to a phrase distance to the right of this node
 * We need two outputs because we could need to bubble up adjustments to two
 * different parent phrase operators.  Consider
 *        w <-> (((a <-> x) <2> (y <3> a)) <-> z)
 * After we've removed the two a's and are considering the <2> node (which is
 * now just x <2> y), we have an ladd distance of 1 that needs to propagate
 * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
 * propagate to the rightmost <->, so that we'll end up with
 *        w <2> ((x <2> y) <4> z)
 * Near the bottom of the tree, we may have subtrees consisting only of
 * stopwords.  The distances of any phrase operators within such a subtree are
 * summed and propagated to both ladd and radd, since we don't know which side
 * of the lowest surviving phrase operator we are in.  The rule is that any
 * subtree that degenerates to NULL must return equal values of ladd and radd,
 * and the parent node dealing with it should incorporate only one of those.
 *
 * Currently, we only implement this adjustment for adjacent phrase operators.
 * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
 * isn't ideal, but there is no way to represent the really desired semantics
 * without some redesign of the tsquery structure.  Certainly it would not be
 * any better to convert that to 'x <2> (y | z)'.  Since this is such a weird
 * corner case, let it go for now.  But we can fix it in cases where the
 * intervening non-phrase operator also gets removed, for example
 * '((x <-> a) | a) <-> y' will become 'x <2> y'.
 */
static NODE *
clean_stopword_intree(NODE *node, int *ladd, int *radd)
{// #lizard forgives
    /* since this function recurses, it could be driven to stack overflow. */
    check_stack_depth();

    /* default output parameters indicate no change in parent distance */
    *ladd = *radd = 0;

    if (node->valnode->type == QI_VAL)
        return node;
    else if (node->valnode->type == QI_VALSTOP)
    {
        pfree(node);
        return NULL;
    }

    Assert(node->valnode->type == QI_OPR);

    if (node->valnode->qoperator.oper == OP_NOT)
    {
        /* NOT doesn't change pattern width, so just report child distances */
        node->right = clean_stopword_intree(node->right, ladd, radd);
        if (!node->right)
        {
            freetree(node);
            return NULL;
        }
    }
    else
    {
        NODE       *res = node;
        bool        isphrase;
        int            ndistance,
                    lladd,
                    lradd,
                    rladd,
                    rradd;

        /* First, recurse */
        node->left = clean_stopword_intree(node->left, &lladd, &lradd);
        node->right = clean_stopword_intree(node->right, &rladd, &rradd);

        /* Check if current node is OP_PHRASE, get its distance */
        isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
        ndistance = isphrase ? node->valnode->qoperator.distance : 0;

        if (node->left == NULL && node->right == NULL)
        {
            /*
             * When we collapse out a phrase node entirely, propagate its own
             * distance into both *ladd and *radd; it is the responsibility of
             * the parent node to count it only once.  Also, for a phrase
             * node, distances coming from children are summed and propagated
             * up to parent (we assume lladd == lradd and rladd == rradd, else
             * rule was broken at a lower level).  But if this isn't a phrase
             * node, take the larger of the two child distances; that
             * corresponds to what TS_execute will do in non-stopword cases.
             */
            if (isphrase)
                *ladd = *radd = lladd + ndistance + rladd;
            else
                *ladd = *radd = Max(lladd, rladd);
            freetree(node);
            return NULL;
        }
        else if (node->left == NULL)
        {
            /* Removing this operator and left subnode */
            /* lladd and lradd are equal/redundant, don't count both */
            if (isphrase)
            {
                /* operator's own distance must propagate to left */
                *ladd = lladd + ndistance + rladd;
                *radd = rradd;
            }
            else
            {
                /* at non-phrase op, just forget the left subnode entirely */
                *ladd = rladd;
                *radd = rradd;
            }
            res = node->right;
            pfree(node);
        }
        else if (node->right == NULL)
        {
            /* Removing this operator and right subnode */
            /* rladd and rradd are equal/redundant, don't count both */
            if (isphrase)
            {
                /* operator's own distance must propagate to right */
                *ladd = lladd;
                *radd = lradd + ndistance + rradd;
            }
            else
            {
                /* at non-phrase op, just forget the right subnode entirely */
                *ladd = lladd;
                *radd = lradd;
            }
            res = node->left;
            pfree(node);
        }
        else if (isphrase)
        {
            /* Absorb appropriate corrections at this level */
            node->valnode->qoperator.distance += lradd + rladd;
            /* Propagate up any unaccounted-for corrections */
            *ladd = lladd;
            *radd = rradd;
        }
        else
        {
            /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
        }

        return res;
    }
    return node;
}

/*
 * Number of elements in query tree
 */
static int32
calcstrlen(NODE *node)
{
    int32        size = 0;

    if (node->valnode->type == QI_VAL)
    {
        size = node->valnode->qoperand.length + 1;
    }
    else
    {
        Assert(node->valnode->type == QI_OPR);

        size = calcstrlen(node->right);
        if (node->valnode->qoperator.oper != OP_NOT)
            size += calcstrlen(node->left);
    }

    return size;
}

/*
 * Remove QI_VALSTOP (stopword) nodes from TSQuery.
 */
TSQuery
cleanup_tsquery_stopwords(TSQuery in)
{
    int32        len,
                lenstr,
                commonlen,
                i;
    NODE       *root;
    int            ladd,
                radd;
    TSQuery        out;
    QueryItem  *items;
    char       *operands;

    if (in->size == 0)
        return in;

    /* eliminate stop words */
    root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
    if (root == NULL)
    {
        ereport(NOTICE,
                (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
        out = palloc(HDRSIZETQ);
        out->size = 0;
        SET_VARSIZE(out, HDRSIZETQ);
        return out;
    }

    /*
     * Build TSQuery from plain view
     */

    lenstr = calcstrlen(root);
    items = plaintree(root, &len);
    commonlen = COMPUTESIZE(len, lenstr);

    out = palloc(commonlen);
    SET_VARSIZE(out, commonlen);
    out->size = len;

    memcpy(GETQUERY(out), items, len * sizeof(QueryItem));

    items = GETQUERY(out);
    operands = GETOPERAND(out);
    for (i = 0; i < out->size; i++)
    {
        QueryOperand *op = (QueryOperand *) &items[i];

        if (op->type != QI_VAL)
            continue;

        memcpy(operands, GETOPERAND(in) + op->distance, op->length);
        operands[op->length] = '\0';
        op->distance = operands - GETOPERAND(out);
        operands += op->length + 1;
    }

    return out;
}
